Ta Feng Grocery Dataset
The dataset contains a Chinese grocery store transaction data from November 2000 to February 2001. Column definition: Transaction date and time (no timestamp), Customer ID, Age Group, PIN Code, Product subclass, Product ID, Amount, Asset, Sales price.
此資訊僅作為後續進行行銷企劃的假設基礎
主要營業項目:
專業生產各種果汁機、搾汁機、攪拌機、研磨機以及所有機種使用之小型馬達,塑膠杯射出以及不銹鋼濾網,並於廠內進行所有組裝、品管。主要產品:
果汁機、多功能三合一、多功能果菜搾汁機、手動攪拌機、全功能高氧食品養生機、多功能性榨汁機、多功能性果汁機、攪拌機、豆漿機、粉碎機、冰沙機、通用馬達等。
Data Aggregation and manipulation TF
TF = read.csv("/Users/shuushoden/Desktop/Final/ta_feng_all_months_merged.csv", stringsAsFactors = FALSE, header = TRUE)
names(TF) = c("date","cid","age","area","cat","pid","amount","cost","price") #817741TF$date = as.Date(TF$date, orgin = "11/1/2000", format = "%m/%d/%Y")
TF$date1 = format(TF$date, "%Y-%m-%d") %>% as.Date
TF$age[is.na(TF$age)] = "na"
TF$age_group = TF$age #age range
TF$age = factor(TF$age, levels = c("<25","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64",">65","na"), labels=c(
"a20","a25","a30","a35","a40","a45","a50","a55","a60","a65","na")) %>% as.character
TF$now = as.Date(max(TF$date1) + 1) # use as the endpoint of time to calculate the recency
TF$area = paste0("x",TF$area)
TF$oid = group_indices(TF, date1, cid) #訂單資料
summary(TF) date cid age area
Min. :2000-11-01 Min. : 1069 Length:817741 Length:817741
1st Qu.:2000-11-28 1st Qu.: 969222 Class :character Class :character
Median :2001-01-01 Median : 1587722 Mode :character Mode :character
Mean :2000-12-30 Mean : 1406620
3rd Qu.:2001-01-30 3rd Qu.: 1854930
Max. :2001-02-28 Max. :20002000
cat pid amount cost
Min. :100101 Min. : 20008819 Min. : 1.00 Min. : 0
1st Qu.:110106 1st Qu.:4710085127020 1st Qu.: 1.00 1st Qu.: 35
Median :130106 Median :4710421090060 Median : 1.00 Median : 62
Mean :284950 Mean :4461639280530 Mean : 1.38 Mean : 112
3rd Qu.:520314 3rd Qu.:4712500125130 3rd Qu.: 1.00 3rd Qu.: 112
Max. :780510 Max. :9789579967620 Max. :1200.00 Max. :432000
price date1 age_group now
Min. : 1 Min. :2000-11-01 Length:817741 Min. :2001-03-01
1st Qu.: 42 1st Qu.:2000-11-28 Class :character 1st Qu.:2001-03-01
Median : 76 Median :2001-01-01 Mode :character Median :2001-03-01
Mean : 132 Mean :2000-12-30 Mean :2001-03-01
3rd Qu.: 132 3rd Qu.:2001-01-30 3rd Qu.:2001-03-01
Max. :444000 Max. :2001-02-28 Max. :2001-03-01
oid
Min. : 1
1st Qu.: 28802
Median : 59475
Mean : 58922
3rd Qu.: 87521
Max. :119578
Check Quantile and Remove Outliers
#check out the outliers on the amoun, cost and prcie
sapply(TF[,7:9], quantile, prob = c(0.99, 0.999, 0.9995)) amount cost price
99% 6 858.0 1014.0
99.9% 14 2722.0 3135.8
99.95% 24 3799.3 3999.0
# outliers removal(99.95%)
TF = subset(TF, amount <= 24 & cost <= 3799.3 & price <= 3999)
summary(TF) date cid age area
Min. :2000-11-01 Min. : 1069 Length:817181 Length:817181
1st Qu.:2000-11-28 1st Qu.: 968775 Class :character Class :character
Median :2001-01-01 Median : 1587685 Mode :character Mode :character
Mean :2000-12-30 Mean : 1406500
3rd Qu.:2001-01-30 3rd Qu.: 1854701
Max. :2001-02-28 Max. :20002000
cat pid amount cost
Min. :100101 Min. : 20008819 Min. : 1.00 Min. : 0
1st Qu.:110106 1st Qu.:4710085127020 1st Qu.: 1.00 1st Qu.: 35
Median :130106 Median :4710421090060 Median : 1.00 Median : 62
Mean :284784 Mean :4461978474490 Mean : 1.36 Mean : 106
3rd Qu.:520311 3rd Qu.:4712500125130 3rd Qu.: 1.00 3rd Qu.: 112
Max. :780510 Max. :9789579967620 Max. :24.00 Max. :3798
price date1 age_group now
Min. : 1 Min. :2000-11-01 Length:817181 Min. :2001-03-01
1st Qu.: 42 1st Qu.:2000-11-28 Class :character 1st Qu.:2001-03-01
Median : 76 Median :2001-01-01 Mode :character Median :2001-03-01
Mean : 126 Mean :2000-12-30 Mean :2001-03-01
3rd Qu.: 132 3rd Qu.:2001-01-30 3rd Qu.:2001-03-01
Max. :3999 Max. :2001-02-28 Max. :2001-03-01
oid
Min. : 1
1st Qu.: 28812
Median : 59483
Mean : 58929
3rd Qu.: 87531
Max. :119578
Creating Transaction ID
cid cat pid tid
32256 2007 23789 119422
date cid age area
Min. :2000-11-01 Min. : 1069 Length:817181 Length:817181
1st Qu.:2000-11-28 1st Qu.: 968775 Class :character Class :character
Median :2001-01-01 Median : 1587685 Mode :character Mode :character
Mean :2000-12-30 Mean : 1406500
3rd Qu.:2001-01-30 3rd Qu.: 1854701
Max. :2001-02-28 Max. :20002000
cat pid amount cost
Min. :100101 Min. : 20008819 Min. : 1.00 Min. : 0
1st Qu.:110106 1st Qu.:4710085127020 1st Qu.: 1.00 1st Qu.: 35
Median :130106 Median :4710421090060 Median : 1.00 Median : 62
Mean :284784 Mean :4461978474490 Mean : 1.36 Mean : 106
3rd Qu.:520311 3rd Qu.:4712500125130 3rd Qu.: 1.00 3rd Qu.: 112
Max. :780510 Max. :9789579967620 Max. :24.00 Max. :3798
price date1 age_group now
Min. : 1 Min. :2000-11-01 Length:817181 Min. :2001-03-01
1st Qu.: 42 1st Qu.:2000-11-28 Class :character 1st Qu.:2001-03-01
Median : 76 Median :2001-01-01 Mode :character Median :2001-03-01
Mean : 126 Mean :2000-12-30 Mean :2001-03-01
3rd Qu.: 132 3rd Qu.:2001-01-30 3rd Qu.:2001-03-01
Max. :3999 Max. :2001-02-28 Max. :2001-03-01
oid tid
Min. : 1 Min. : 1
1st Qu.: 28812 1st Qu.: 28783
Median : 59483 Median : 59391
Mean : 58929 Mean : 58845
3rd Qu.: 87531 3rd Qu.: 87391
Max. :119578 Max. :119422
交易相關資料 TF_oid
aggregation for transaction dataframe (group by orderid)
TF_oid = TF %>% group_by(oid) %>%
summarise(
date = date[1], # transaction date
cid = cid[1], # customer id
age = age[1], # customer age group
area = area[1], # pin code
items = n(), # the numbers of order/transaction
pieces = sum(amount), # the numbers of product? in a order
total = sum(price), # sales worth of a order
gross = sum(price - cost) # gross profit for a oder
) %>% as.data.frame()
nrow(TF_oid) #119422[1] 119422
Summary of transaction dataframe
oid date cid age
Min. : 1 Min. :2000-11-01 Min. : 1069 Length:119422
1st Qu.: 29887 1st Qu.:2000-11-29 1st Qu.: 927093 Class :character
Median : 59804 Median :2001-01-01 Median : 1615661 Mode :character
Mean : 59796 Mean :2000-12-31 Mean : 1402548
3rd Qu.: 89707 3rd Qu.:2001-02-02 3rd Qu.: 1894493
Max. :119578 Max. :2001-02-28 Max. :20002000
area items pieces total
Length:119422 Min. : 1.00 Min. : 1.00 Min. : 5
Class :character 1st Qu.: 2.00 1st Qu.: 3.00 1st Qu.: 227
Mode :character Median : 5.00 Median : 6.00 Median : 510
Mean : 6.84 Mean : 9.29 Mean : 859
3rd Qu.: 9.00 3rd Qu.: 12.00 3rd Qu.: 1082
Max. :112.00 Max. :339.00 Max. :30171
gross
Min. :-1645
1st Qu.: 21
Median : 68
Mean : 132
3rd Qu.: 169
Max. : 8069
Check Quantile and Remove Outliers
#check out the outliers on the amoun, cost and prcie
sapply(TF_oid[,6:9], quantile, prob = c(0.99, 0.999, 0.9995)) items pieces total gross
99% 33 46.00 4970.6 960.0
99.9% 54 81.00 9009.6 1824.7
99.95% 62 94.29 10611.6 2179.8
# outliers removal for items, pieces, total and gross (99.95%)
TF_oid = subset(TF_oid, items<=62 & pieces<= 94.29 & total<10611.6, gross <= 2179.8)
nrow(TF_oid) #119298 (TONY:119328)[1] 119298
par(mfrow=c(3,1), mar=c(2,6,4,1))
hist(TF$date, "weeks", freq=T, las=3, main="Num of transactions per Week")
hist(TF$date, "months", freq=T, las=3, main="Num of transactions per Month")顧客相關資料 TF_cus
Aggregation for customer dataframe
d0 = max(TF$date) +1
TF_cus = TF_oid %>% mutate(days = as.integer(difftime(d0, date, units = "days"))) %>%
group_by(cid) %>%
summarise(
r = min(days), # recency
f = n(), # frequency
m = mean(total), # monetary by mean
m_median = median(total), # monetary by median
s = max(days), # seniority
rev = sum(total), # total spending on a customer(revenue)
value = sum(gross), # the contrbution(worth) to seller
age = age[1], # age group of customer
area = area[1] # pin code where the customer locates
) %>% as.data.frame()
nrow(TF_cus) #32239;32241[1] 32239
summary for customer dataframe
cid r f m
Min. : 1069 Min. : 1.0 Min. : 1.0 Min. : 8
1st Qu.: 1088442 1st Qu.: 9.0 1st Qu.: 1.0 1st Qu.: 365
Median : 1663402 Median : 26.0 Median : 2.0 Median : 705
Mean : 1473559 Mean : 37.5 Mean : 3.7 Mean : 990
3rd Qu.: 1958036 3rd Qu.: 60.0 3rd Qu.: 4.0 3rd Qu.: 1290
Max. :20002000 Max. :120.0 Max. :85.0 Max. :10532
m_median s rev value
Min. : 8 Min. : 1.0 Min. : 8 Min. : -784
1st Qu.: 320 1st Qu.: 56.0 1st Qu.: 707 1st Qu.: 75
Median : 632 Median : 92.0 Median : 1749 Median : 241
Mean : 938 Mean : 80.8 Mean : 3140 Mean : 482
3rd Qu.: 1213 3rd Qu.:110.0 3rd Qu.: 3964 3rd Qu.: 611
Max. :10532 Max. :120.0 Max. :127686 Max. :20273
age area
Length:32239 Length:32239
Class :character Class :character
Mode :character Mode :character
Check NA values
found abnormal NA value on Mac
date cid age area cat pid amount cost
0 0 22346 0 0 0 0 0
price date1 age_group now oid tid
0 0 0 0 0 0
oid date cid age area items pieces total gross
0 0 0 4374 0 0 0 0 0
cid r f m m_median s rev value
0 0 0 0 0 0 0 0
age area
626 0
資料視覺與處理 Visualization and Manipulation
Look into the orginal dataset
由上圖分佈可知此公司的主要銷售熱點落在某一項產品,該產品類別的子類別產品應該也貢獻了一部份的銷售。 大部分的消費數量為1-5,推測此公司主要對消費端居多(B2C),假設販售的是終端產品。 經過對數轉換的消費金額分佈,眾數消費金額為100,分佈中可見較大的金額在1000的區間(此部分與我們假設的營銷內容較違和)。
此公司的主要客群落在30s,再者為40s。
界定顧客價值 Calculating Recency(R), Frequency(F), and Monetery(M) scores
Recency score:br> The score assigned to each customer based on the value of the Transaction Date selected on the Variables tab.
Higher scores are assigned to more recent dates or lower interval values.
Frequency score:
The score assigned to each customer based on the Number of Transactions variable selected on the Variables tab.
Higher scores are assigned to higher values.
Monetary score:
The score assigned to each customer based on the Amount variable selected on the Variables tab.
Higher scores are assigned to higher values.
RFM score:
The three individual scores combined into a single value: (recency100) + (frequency10) + monetary.
# find the quantile of r,f,m respectively and use it as five breaks of rfm levels
sapply(TF_cus[,2:4], quantile, probs = seq(0,1, 0.2)) r f m
0% 1 1 8.00
20% 7 1 309.04
40% 18 2 558.00
60% 38 3 890.00
80% 74 5 1487.00
100% 120 85 10532.00
TF_RFM = TF_cus %>%
mutate(R_level = as.factor(ifelse(between(r, 1, 7), 'very low',
ifelse(between(r, 7, 18), 'low',
ifelse(between(r, 18, 38), 'medium',
ifelse(between(r, 38, 74), 'high',
ifelse(between(r, 74, 120), 'very high', 'super high'))))))) %>%
mutate(F_level = as.factor(ifelse(between(f, 1, 1), 'very low',
ifelse(between(f, 1, 2), 'low',
ifelse(between(f, 2, 3), 'medium',
ifelse(between(f, 3, 5), 'high',
ifelse(between(f, 5, 85), 'very high', 'super high'))))))) %>%
mutate(M_level = as.factor(ifelse(between(m, 8, 309.04), 'very low',
ifelse(between(m, 309.04, 558), 'low',
ifelse(between(m, 558, 890), 'medium',
ifelse(between(m,890, 1487), 'high',
ifelse(between(m, 1487, 10532), 'very high', 'super high'))))))) %>%
mutate(R_score = as.numeric(ifelse(between(r, 1, 7), '1',
ifelse(between(r, 7, 18), '2',
ifelse(between(r, 18, 38), '3',
ifelse(between(r, 38, 74), '4',
ifelse(between(r, 74, 120), '5', '>5'))))))) %>%
mutate(F_score = as.numeric(ifelse(between(f, 1, 1), '1',
ifelse(between(f, 1, 2), '2',
ifelse(between(f, 2, 3), '3',
ifelse(between(f, 3, 5), '4',
ifelse(between(f, 5, 85), '5', '>5'))))))) %>%
mutate(M_score = as.numeric(ifelse(between(m, 8, 309.04), '1',
ifelse(between(m, 309.04, 558), '2',
ifelse(between(m, 558, 890), '3',
ifelse(between(m,890, 1487), '4',
ifelse(between(m, 1487, 10532), '5', '>5'))))))) %>% mutate(RFM_score = R_score*100 + F_score*10 + M_score)
summary(TF_RFM) cid r f m
Min. : 1069 Min. : 1.0 Min. : 1.0 Min. : 8
1st Qu.: 1088442 1st Qu.: 9.0 1st Qu.: 1.0 1st Qu.: 365
Median : 1663402 Median : 26.0 Median : 2.0 Median : 705
Mean : 1473559 Mean : 37.5 Mean : 3.7 Mean : 990
3rd Qu.: 1958036 3rd Qu.: 60.0 3rd Qu.: 4.0 3rd Qu.: 1290
Max. :20002000 Max. :120.0 Max. :85.0 Max. :10532
m_median s rev value
Min. : 8 Min. : 1.0 Min. : 8 Min. : -784
1st Qu.: 320 1st Qu.: 56.0 1st Qu.: 707 1st Qu.: 75
Median : 632 Median : 92.0 Median : 1749 Median : 241
Mean : 938 Mean : 80.8 Mean : 3140 Mean : 482
3rd Qu.: 1213 3rd Qu.:110.0 3rd Qu.: 3964 3rd Qu.: 611
Max. :10532 Max. :120.0 Max. :127686 Max. :20273
age area R_level F_level
Length:32239 Length:32239 high :6382 high : 4416
Class :character Class :character low :6271 low : 6295
Mode :character Mode :character medium :6224 medium : 3858
very high:6327 very high: 5774
very low :7035 very low :11896
M_level R_score F_score M_score RFM_score
high :6449 Min. :1.00 Min. :1.00 Min. :1 Min. :111
low :6455 1st Qu.:2.00 1st Qu.:1.00 1st Qu.:2 1st Qu.:215
medium :6441 Median :3.00 Median :2.00 Median :3 Median :324
very high:6446 Mean :2.96 Mean :2.56 Mean :3 Mean :325
very low :6448 3rd Qu.:4.00 3rd Qu.:4.00 3rd Qu.:4 3rd Qu.:425
Max. :5.00 Max. :5.00 Max. :5 Max. :555
[1] 32239
以上為計算個別顧客之近期購買(R)、購買頻率(F)以及平均消費金額(M)分布圖。 recency越高表示離今的上次消費越近;frequency越高表示過去4個月以來來店消費的頻率越高;monetary越高表示來店平均消費的金額越高。
在M跟F的分佈中有很多離群值需要處理。
r f m m_median
99.9% 119 52.762 7633.3 7633.3
99.95% 119 58.881 8132.7 8132.7
99.99% 120 78.329 9569.1 9569.1
# outliers removal for r, f, and m (99.95%)
TF_RFM = subset(TF_RFM, r <=119 & f<=58.8810 & m<=8132.656) # obs: 32239 -> 32198
nrow(TF_RFM)[1] 32198
#年齡層與週間購買熱圖
table(TF$age,format(TF$date,"%u"))%>%
{./rowSums(.)}%>%
as.data.frame.matrix%>%
d3heatmap(F,F,col="Greens")由此可發現可年齡層多數在星期天購買,而30歲左右的客群較明顯有該趨勢;星期三是多數客群不購買的日子,唯獨20歲左右客群是最少在星期五購買而非星期三。
集群處理 Clustering
K-means
1 2 3 4 5 6 7
6234 447 10051 4772 6248 1427 3019
這個方法求出最適分群數量為2,但不符合我們需求。還不錯的分群在3-8這個區間,看起來隨分群數增加分群效果會遞減。
- Clustering visualizations
group_by(TF_RFM, grp) %>% summarise(
recent=mean(r),
freq=mean(f),
money=mean(m),
size=n()
) %>%
mutate(revenue = size*money/1000) %>%
filter(size > 1) %>%
ggplot(aes(x=freq, y=money)) +
geom_point(aes(size=revenue, col=recent),alpha=0.5) +
scale_size(range=c(4,30)) +
scale_color_gradient(low="green",high="red") +
geom_text(aes(label = size ),size=3) +
scale_x_log10()+scale_y_log10()+
theme_bw() + guides(size=F) +
labs(title="Customer Segements",
subtitle="(bubble_size:revenue_contribution; text:group_size)",
color="Recency") +
xlab("Frequency (log)") + ylab("Average Transaction Amount (log)") 圖形解析:
將現有顧客分成七群,每個泡泡分別代表一群。
4種屬性,大小、顏色、X軸與Y軸可供判讀。
X軸:購買頻率。
Y軸:平均交易金額(客單價)。
泡泡大小:反映這群顧客對你的營收貢獻。
泡泡顏色:越紅就代表越久沒來買,可能快要流失了。
可以針對很常來買(頻率高),買很少(客單價低),去做行銷策略,擬定對這群顧客增加客單價的方法
解釋:
客群1(447人):來店購買頻率高,交易單價低,所佔人數亦不多
客群2(4770人):來的頻率不高,買的數量算多,交易單價為最高的,要想辦法留住他們。
客群3(5944人):來店頻率不高,交易數量亦不高,交易單價為中等。
客群4(9638人):交易數量低,最近也不常來,交易單價為中等。
客群5(3747人):較常來,營收貢獻不多,交易數量也不多,可想辦法提升其交易數量。
客群6(6225人):最近來有,但以往交易數量及頻率都低,需想辦法提升其來店次數。
客群7(1427人):交易數量極高,最近也有來,需想辦法增加其來店次數。
[!]
以各年齡層來看,多數為客群4(交易數量低,最近也不常來,交易單價為中等。)…其中又以65歲及20歲的族群為客群4的占比最高(說明該兩客群為上述特徵高)。
X114地區為客群4的比例最高,而X106佔重點客群2的比例高,來的頻率不高,買的數量算多,交易單價為最高的,故要想辦法留住他們。
以RFM界定規則分群
#設計分群規則 #以曝光為目的分類
STS<-c("A1","A2","B1","B2","C1","C2","D1","D2")
Status<-function(rc,fc,mc){factor(
ifelse(rc<4,
ifelse(fc<4,ifelse(mc<4,"A1","A2"),ifelse(mc<4,"B1","B2")),
ifelse(fc<4,ifelse(mc<4,"C1","C2"),ifelse(mc<4,"D1","D2"))),STS)}
TF_cus = TF_X %>%
group_by(cid) %>%
summarise(
status=Status(R_score, F_score, M_score))%>%
data.frame() %>%
merge(TF_cus,all.x = T)規則分群說明
A1 A2 B1 B2 C1 C2 D1 D2
5896 4368 6090 3156 6744 5017 594 333
原本 A1 A2 B1 B2 C1 C2 D1 D2
7033 2255 4757 5504 7082 3007 488 2140
- Clustering visualizations
#看不同年齡層內的客群分布占比
GP01 = TF_cus %>% ggplot()+
geom_bar(aes(x=age,fill=status),position = "fill")+
labs(y="percentage",size=10)+
scale_y_continuous(breaks = c(0,0.25,0.5,0.75,1),labels = c("0%","25%","50%","75%","100%"))+
scale_fill_discrete(name="Group")
ggplotly(GP01)以我們這次行銷的重點客群B2與C1來看:B2客群為35-45歲居多,而C1客群則為小於25歲以及65歲以上居多。
#居住地與分群
table(TF_cus$area,TF_cus$status)%>%
{./rowSums(.)}%>%
as.data.frame.matrix()%>%
d3heatmap(F,F,colors = "Blues")住在115的B1客群(流失)顧客最多,而我們的重點顧客群B2主要住在115以及221,C1則主要是住在105及110的顧客較多。
TF0 = TF; TF_oid0 = TF_oid; TF_cus0 = TF_cus ; TF_RFM0 =TF_RFM ; TF_X0 =TF_X
save(TF0, TF_oid0, TF_cus0, TF_RFM0, TF_X0,
file="/Users/shuushoden/Desktop/TF_Final_part1.rdata")資料框切割 Dateframe Splitting
製作預測變數 Feature engineering
[1] 618211
X = TF %>% group_by(oid) %>%
summarise(
date = first(date),
cid = first(cid),
age = first(age),
items = n(),
pieces = sum(amount),
total = sum(price),
gross = sum(price - cost)
) %>%
left_join(TF_oid[c(1,5)], by = "oid") %>%
as.data.frame()
nrow(X) #88387[1] 88387
oid date cid age
Min. : 1 Min. :2000-11-01 Min. : 1069 Length:88387
1st Qu.:22122 1st Qu.:2000-11-23 1st Qu.: 923910 Class :character
Median :44238 Median :2000-12-12 Median : 1607000 Mode :character
Mean :44255 Mean :2000-12-15 Mean : 1395768
3rd Qu.:66390 3rd Qu.:2001-01-12 3rd Qu.: 1888874
Max. :88527 Max. :2001-01-31 Max. :20002000
items pieces total gross
Min. : 1.00 Min. : 1.00 Min. : 5 Min. :-1645
1st Qu.: 2.00 1st Qu.: 3.00 1st Qu.: 230 1st Qu.: 23
Median : 5.00 Median : 6.00 Median : 522 Median : 72
Mean : 6.99 Mean : 9.45 Mean : 889 Mean : 138
3rd Qu.: 9.00 3rd Qu.: 12.00 3rd Qu.: 1120 3rd Qu.: 174
Max. :112.00 Max. :339.00 Max. :30171 Max. : 8069
area
Length:88387
Class :character
Mode :character
Check Quantile and Remove Outlier
items pieces total gross
99.9% 56.000 84.00 9378.7 1883.2
99.95% 64.000 98.00 11261.8 2317.1
99.99% 85.646 137.65 17699.3 3389.6
# outliers removal for items, pieces, total and gross (99.95%)
X = subset(X, items<=64 & pieces<=98 & total<=11261.8 & gross <= 2317.1)
nrow(X) # 88387 -> 88285[1] 88285
Customer Records Aggregation (in Nov, Dec, Jan) X
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cid) %>%
summarise(
r = min(days), # recency
f = n(), # frequency
m = mean(total), # monetary
s = max(days), # seniority
rev = sum(total), # total revenue contribution
value = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% as.data.frame() # 28579
nrow(A)[1] 28579
### Prepare for the Target variable (Y) ##### Customer Records Aggregation (in Feb) Y
feb = TF_oid0 %>%
filter(date >= feb01) %>%
group_by(cid) %>%
summarise(amount = sum(total)) #16898
nrow(feb)[1] 16898
Target variable for regression A$amount
Target variable for regression A$buy
Mode FALSE TRUE
logical 15342 13237
Association of Categorial Predictors
CONTEST DATASET(TONY)
TF_oid = subset(TF_oid, cid %in% A$cid & date < as.Date("2001-02-01"))
TF = subset(TF, cid %in% A$cid & date < as.Date("2001-02-01"))#
set.seed(2008); spl = sample.split(A$buy, SplitRatio = 0.7)
c(nrow(A), sum(spl), sum(!spl)) #28579;20005;8574[1] 28579 20005 8574
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)#
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2)) #13237;9266;3971[1] 13237 9266 3971
save(TF_oid, TF, X, Status,STS, A, spl, spl2, file="/Users/shuushoden/Desktop/TF_Final_part2.rdata")預測模型 Modeling
Classification Model
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
11)])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.512 -0.870 -0.718 1.040 1.862
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.2844904 0.1270650 -10.11 <0.0000000000000002 ***
r -0.0109725 0.0009117 -12.04 <0.0000000000000002 ***
f 0.3207082 0.0167176 19.18 <0.0000000000000002 ***
m -0.0000378 0.0000285 -1.33 0.1836
s 0.0088524 0.0009290 9.53 <0.0000000000000002 ***
rev 0.0000460 0.0000201 2.29 0.0220 *
value -0.0002831 0.0000879 -3.22 0.0013 **
agea25 -0.0384522 0.0867776 -0.44 0.6577
agea30 0.0668997 0.0798397 0.84 0.4021
agea35 0.1052453 0.0792223 1.33 0.1840
agea40 0.1023404 0.0816900 1.25 0.2103
agea45 0.0321231 0.0849494 0.38 0.7053
agea50 -0.0121775 0.0935525 -0.13 0.8964
agea55 0.1263051 0.1107591 1.14 0.2541
agea60 0.1459275 0.1178577 1.24 0.2157
agea65 0.1670418 0.1031885 1.62 0.1055
areax106 -0.0620422 0.1325359 -0.47 0.6397
areax110 -0.1799525 0.1043431 -1.72 0.0846 .
areax114 -0.0381595 0.1120629 -0.34 0.7335
areax115 0.2273651 0.0969914 2.34 0.0191 *
areax221 0.0948507 0.0977257 0.97 0.3318
areaxOthers -0.0722869 0.1043960 -0.69 0.4887
areaxUnknown -0.1234381 0.1262871 -0.98 0.3284
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27083 on 19611 degrees of freedom
Residual deviance: 22813 on 19589 degrees of freedom
(393 observations deleted due to missingness)
AIC: 22859
Number of Fisher Scoring iterations: 5
predict
actual FALSE TRUE
FALSE 3649 864
TRUE 1660 2236
[1] 0.69985
[,1]
FALSE vs. TRUE 0.74441
Regression Model
# for those who would buy, predict their spending amount
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
Call:
lm(formula = amount ~ ., data = TR2[, c(2:6, 7:10)])
Residuals:
Min 1Q Median 3Q Max
-2.0100 -0.2242 0.0493 0.2845 1.4520
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.38429200 0.05486197 25.23 < 0.0000000000000002 ***
r 0.00006086 0.00031560 0.19 0.84708
f 0.01922055 0.00207236 9.27 < 0.0000000000000002 ***
m 0.44675175 0.03905153 11.44 < 0.0000000000000002 ***
s 0.00023283 0.00031736 0.73 0.46317
rev 0.02550641 0.03730768 0.68 0.49420
value 0.00008033 0.00000909 8.83 < 0.0000000000000002 ***
agea25 0.04055068 0.02492514 1.63 0.10379
agea30 0.09487483 0.02310185 4.11 0.00004047 ***
agea35 0.11795111 0.02269856 5.20 0.00000021 ***
agea40 0.09665095 0.02330489 4.15 0.00003396 ***
agea45 0.08414328 0.02412993 3.49 0.00049 ***
agea50 0.07909778 0.02631518 3.01 0.00266 **
agea55 0.06409954 0.03099276 2.07 0.03865 *
agea60 0.02615749 0.03260789 0.80 0.42247
agea65 -0.03087909 0.02856767 -1.08 0.27977
areax106 0.04225606 0.04334562 0.97 0.32965
areax110 0.02939456 0.03496356 0.84 0.40053
areax114 -0.01942647 0.03678731 -0.53 0.59746
areax115 -0.02041485 0.03225261 -0.63 0.52677
areax221 -0.00028884 0.03248711 -0.01 0.99291
areaxOthers -0.01028797 0.03469734 -0.30 0.76685
areaxUnknown -0.02265367 0.03990501 -0.57 0.57026
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.424 on 9071 degrees of freedom
(172 observations deleted due to missingness)
Multiple R-squared: 0.286, Adjusted R-squared: 0.284
F-statistic: 165 on 22 and 9071 DF, p-value: <0.0000000000000002
Df Sum Sq Mean Sq F value Pr(>F)
TR2$area 7 8 1.082 4.29 0.000095 ***
Residuals 9256 2334 0.252
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2 observations deleted due to missingness
Warning in chisq.test(table(TR2$area, TR2$amount)): Chi-squared approximation
may be incorrect
Pearson's Chi-squared test
data: table(TR2$area, TR2$amount)
X-squared = 24900, df = 24500, p-value = 0.042
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)[1] 0.28574 NA
Prediction
d0 = max(X$date) + 1
B = X %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cid) %>%
summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
value = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% as.data.frame()
nrow(B) # 23466[1] 23466
B0 = B %>% group_by(cid) %>%
mutate(R_score = as.numeric(ifelse(between(r, 1, 7), '1',
ifelse(between(r, 7, 18), '2',
ifelse(between(r, 18, 38), '3',
ifelse(between(r, 38, 74), '4',
ifelse(between(r, 74, 120), '5', '>5'))))))) %>%
mutate(F_score = as.numeric(ifelse(between(f, 1, 1), '1',
ifelse(between(f, 1, 2), '2',
ifelse(between(f, 2, 3), '3',
ifelse(between(f, 3, 5), '4',
ifelse(between(f, 5, 85), '5', '>5'))))))) %>%
mutate(M_score = as.numeric(ifelse(between(m, 8, 309.04), '1',
ifelse(between(m, 309.04, 558), '2',
ifelse(between(m, 558, 890), '3',
ifelse(between(m,890, 1487), '4',
ifelse(between(m, 1487, 10532), '5', '>5'))))))) %>%
summarise(status=Status(R_score, F_score, M_score)) %>% as.data.frame() [1] FALSE
Warning: Ignoring unknown parameters: binwidth, bins, pad
在我們所分出的B2客群中,30-40歲為主要年齡區間。
Warning: Ignoring unknown parameters: binwidth, bins, pad
在我們所分出的C1客群中,30-45歲為主要年齡區間,特別的是大於65歲的族群並未如B2隨年齡增加而減少。
行銷企劃 Target Marketing proposal
以下我們經過討論後,欲針對B2與C1這兩個客群做為我們的重點行銷對象。
原因在於利用RFM的我們分類出的D群,無論是D1,D2對公司目前的貢獻都是最有價值的,
而最接近的C群則是仍與公司保持一定接觸程度的客群,只是在消費頻率上不及D群,
其中我們特別想關注在同樣消費頻率級別上,平均消費金額較低的C1,
在不改變現有的近期造訪時間與頻率情況下,透過針對性的行銷活動,我們相信可以將一部份的C1轉為C2;
再來是B族群,對我們來說,流失的高現金貢獻的族群對公司會有相當程度的傷害,
所以我們希望一部份的行銷企劃可以針對此族群。
而廣泛的行銷企劃不僅可以適用在我們的焦點族群也適用於其他族群,只須額外設計幾個針對B2的行銷手段以達到挽留B2的目的。
C1 新出現的嚐鮮顧客
顧客側寫:
+ 最近常出現(R高)、消費頻率低(F低)、消費金額低(M低)
+ 小於25歲者:初入社會的上班族,生活匆忙沒有多餘的空閒時間,買給長輩當禮物,或跟風買的,但買回家後發現沒時間用。
+ 大於55歲者:注重健康,注重生活品質,空閒時間很多,買來試試回家後發現不會使用,或只會用來製作一種食譜。
行銷方案:
+ 請古娃娃等youtuber業配,主題為10分鐘出門挑戰,像是起床後10分鐘內即可梳妝完畢並做好早餐(如:豆漿…)出門,並在影片最後提供line@官方帳號連結與QRcode。
+ 在line@中定時推播,主要內容包含各產品介紹、 食譜(5分鐘輕鬆做早餐、一次做好一週的早餐、營養果汁取代多糖飲料…)
+ 在特殊節日(如:父親節、母親節…)或通路檔期搭配禮盒販售(調理機本體+備用馬達+組合刀片),並提供產品讓通路配合活動回饋給顧客(如:抽獎)
+ 每年舉辦兩場廚藝講座,3個月內累積購買金額達3500元即可免費參加,主要內容為調理機使用教學及各種特色食譜(如:芋頭牛奶凍…),參加者可直接在講座結束後可參加抽獎活動(獎品:折價券、果汁機…),現場購買產品以及特色食譜,當下凡購買產品即贈特色食譜一本。
B2 可能流失的前忠實顧客
顧客側寫:
+ 近期少來光顧(R低)、消費頻率高(F高)、消費金額低(M高)
+ 我們認為這個消費族群高度注重健康,注重生活品質,平時有自己食用調理機或果汁機料理食物的習慣。推測最近不太來是因為產品曝光度不高或是沒有收到新產品資訊。
行銷方案:
+ 透過簡訊對老顧客發送折價券,憑當日購買產品的發票,可免費獲得新食譜(如:100種方法讓小孩不再挑食、提神別再依靠咖啡,20種菜單讓你精神馬上來…)一本。
+ 找阿基師代言
+ 當日購買金額滿1500元可獲得廚藝教室免費入場券一張(未滿1500須支付器材清潔費),教室內提供免費食材、器材與食譜,顧客可以自行決定要做哪一個食譜,親自操作體驗新產品與相關配件,當日參與的顧客可現場以較優惠的價格購買產品及相關配件。
+ 在line@中定時推播,主要內容包含各產品介紹、 食譜(5分鐘輕鬆做早餐、一次做好一週的早餐、營養果汁取代多糖飲料…)。
Simulate for B2 and C1 respectively
B2
A1 A2 B1 B2 C1 C2 D1 D2
3011 2452 5896 3091 4547 3553 566 320
Assumption 1 : Fixed cost & Fixed repurchasing rate (k1)
#B2總共3091人,預計B2行銷預算為15萬元
cost = 50
k1 = 0.75 # fix effect on the probability of retaintion
par(mar=c(4,3,3,2), cex=0.8)
PI = R0*MRG*pmax(0,k1 - P0) - cost
hist(PI, xlim=c(-100, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(B2)")Assumption 2 : 固定成本、增加回購機率(k2)
k2 = 0.3 # max. incremental effect of instrument
cost = 50 #設定成本為50元/人
par(mar=c(4,3,3,2), cex=0.8)
PI = MRG * R0 * ifelse(P0<=(1-k2), k2, 1-P0) - cost
hist(PI, xlim=c(-50, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(B2)")k2 = 0.3 # max. incremental effect of instrument
cost = 50 #設定成本為50元/人
par(mar=c(4,3,3,2), cex=0.8)
PI = MRG * R0 * ifelse(P0<=(1-k2), k2, 1-P0) - cost
hist(PI, xlim=c(-50, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(B2)")計算工具在B2客群的效益
#B2 AvgROI [40,41]元
B = B %>% mutate(
PI = MRG*Rev*ifelse(Buy<=(1-k2), k2, 1-Buy) - cost
) %>%
na.omit() #23432
B %>% group_by(status) %>%
summarise(
Group.Sz = n(),
No.Target = sum(PI>0),
AvgROI = mean(PI[PI>0]),
TotalROI = sum(PI[PI>0])) %>%
arrange(No.Target) %>%
data.frame status Group.Sz No.Target AvgROI TotalROI
1 D1 545 15 10.9173 163.76
2 C1 4456 34 8.7707 298.20
3 A1 2963 92 10.0603 925.55
4 B1 5755 195 10.9414 2133.57
5 D2 315 231 39.8097 9196.05
6 A2 2416 1777 48.5319 86241.26
7 B2 3055 2036 40.1942 81835.39
8 C2 3500 2585 50.7898 131291.68
Assumption 1 : Fixed cost & Fixed repurchasing rate (k1)
#C1總共4547人,預計C1行銷預算為8萬元
cost = 17.594
k1 = 0.75 # fix effect on the probability of retaintion
par(mar=c(4,3,3,2), cex=0.8)
PI = R0*MRG*pmax(0,k1 - P0) - cost
hist(PI, xlim=c(-100, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(C1)")Assumption 2 : 固定成本、增加回購機率(k2)
k2 = 0.3 # max. incremental effect of instrument
cost = 17.594 #設定成本為17.594元/人
par(mar=c(4,3,3,2), cex=0.8)
PI = MRG * R0 * ifelse(P0<=(1-k2), k2, 1-P0) - cost
hist(PI, xlim=c(-50, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(C1)")k2 = 0.3 # max. incremental effect of instrument
cost = 17.594 #設定成本為17.594元/人
par(mar=c(4,3,3,2), cex=0.8)
PI = MRG * R0 * ifelse(P0<=(1-k2), k2, 1-P0) - cost
hist(PI, xlim=c(-50, 200), breaks=seq(-500,1000,10),main="淨期望報償分布(C1)")計算工具在C1客群的效益
#C1 AvgROI 3.6493 元
B = B %>% mutate(
PI = MRG*Rev*ifelse(Buy<=(1-k2), k2, 1-Buy) - cost
) %>%
na.omit()
B %>% group_by(status) %>%
summarise(
Group.Sz = n(),
No.Target = sum(PI>0),
AvgROI = mean(PI[PI>0]),
TotalROI = sum(PI[PI>0])) %>%
arrange(No.Target) %>%
data.frame status Group.Sz No.Target AvgROI TotalROI
1 D1 545 273 13.690 3737.5
2 D2 315 304 60.286 18327.0
3 A1 2963 1507 13.346 20111.9
4 C1 4456 2096 10.660 22344.2
5 A2 2416 2342 66.440 155603.6
6 B2 3055 2910 56.577 164639.6
7 B1 5755 3004 13.920 41814.5
8 C2 3500 3443 68.466 235728.9
C1 模擬器
成效總結
就以上的模擬器,我們評估我們的行銷策略的預期成效為下:
- B2 size: 3090
- 預計投入成本: NTD 150,000
- 獲得的總ROI: NTD 90,037
- 平均ROI: NTD 41.87
- C1 size: 4547
- 預計投入成本: NTD 80,000
- 獲得的總ROI: NTD 23,162
- 平均ROI: NTD 10.823